Skip to content

Add modular pipeline for HunyuanVideo 1.5 #13389

Open
akshan-main wants to merge 14 commits into huggingface:main from
akshan-main:modular-hunyuan1.5
Open

Add modular pipeline for HunyuanVideo 1.5 #13389
akshan-main wants to merge 14 commits into huggingface:main from
akshan-main:modular-hunyuan1.5

Conversation

@akshan-main
Copy link
Copy Markdown

@akshan-main akshan-main commented Apr 2, 2026

What does this PR do?

Adds modular pipeline blocks for HunyuanVideo 1.5 with both text-to-video (HunyuanVideo15Blocks) and image-to-video (HunyuanVideo15Image2VideoBlocks).

Parity verified on Colab G4 GPU:

  • T2V: MAD 0.000000 vs HunyuanVideo15Pipeline
hv15_t2v_standard.mp4
hv15_t2v_modular.mp4
T2V reproduction code
import gc
import numpy as np
import torch
from diffusers import (
    HunyuanVideo15Pipeline,
    HunyuanVideo15ImageToVideoPipeline,
    HunyuanVideo15Blocks,
    HunyuanVideo15ModularPipeline,
)
from diffusers.utils import load_image, export_to_video

# Device that every pipeline in this script is moved to via `.to(device)`,
# and on which the seeded torch.Generator objects are created.
device = "cuda"
# Passed as `torch_dtype` when loading pipeline weights / components.
dtype = torch.bfloat16

# Model identifiers handed to `from_pretrained` (standard pipelines) and
# `init_pipeline` (modular blocks) for the text-to-video and image-to-video runs.
T2V_ID = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v"
I2V_ID = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_i2v"

def to_np(x):
    """Convert a pipeline output into a NumPy array for numeric comparison.

    The input is unwrapped in this order:

    * an object exposing a ``frames`` attribute is replaced by that attribute,
    * a Python ``list`` is converted with ``np.array``,
    * a ``torch.Tensor`` is detached, cast to float32, moved to CPU and
      converted to a NumPy array.

    Any other value (for example an existing ``np.ndarray``) is returned
    unchanged.
    """
    if hasattr(x, "frames"):
        x = x.frames
    if isinstance(x, list):
        x = np.array(x)
    if isinstance(x, torch.Tensor):
        # detach() first: Tensor.numpy() raises on tensors that require grad.
        # float() is needed because NumPy has no bfloat16 dtype.
        x = x.detach().float().cpu().numpy()
    return x
# Text prompt used by both the standard and the modular T2V runs.
prompt = "A cinematic drone shot over snowy mountains at sunrise."

# --- Reference run: HunyuanVideo15Pipeline ---------------------------------
print("=== Standard T2V ===")

ref_pipe = HunyuanVideo15Pipeline.from_pretrained(T2V_ID, torch_dtype=dtype).to(device)
# Fixed seed so the modular run can be compared against this output.
g = torch.Generator(device=device).manual_seed(1234)
ref_out = ref_pipe(
    prompt=prompt,
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output_type="np",
).frames
print(f"Shape: {np.array(ref_out).shape}")
export_to_video(ref_out[0], "/content/hv15_t2v_standard.mp4", fps=24)

# Drop the reference pipeline and release cached GPU memory before the next
# pipeline is loaded.
del ref_pipe
gc.collect()
torch.cuda.empty_cache()



# --- Modular run: pipeline built from HunyuanVideo15Blocks ------------------
print("\n=== Modular T2V ===")
blocks = HunyuanVideo15Blocks()
pipe = blocks.init_pipeline(T2V_ID)
pipe.load_components(torch_dtype=dtype)
pipe.to(device)

# Print the guider configuration the modular pipeline is running with.
print("Guider type:", type(pipe.guider).__name__)
print("Guider scale:", pipe.guider.guidance_scale)
print("Guider enabled:", pipe.guider._enabled)
print("Guider num_conditions:", pipe.guider.num_conditions)

# Same seed as the reference run.
g = torch.Generator(device=device).manual_seed(1234)
mod_out = pipe(
    prompt=prompt,
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output="videos",
    output_type="np",
)
print(f"Shape: {np.array(mod_out).shape}")
export_to_video(mod_out[0], "/content/hv15_t2v_modular.mp4", fps=24)

# Mean absolute difference between the reference and modular outputs.
diff = np.abs(
    to_np(ref_out).astype(float) - to_np(mod_out).astype(float)
).mean()
print(f"\nT2V MAD: {diff:.6f}")

# Free the modular pipeline before the I2V section loads new weights.
del pipe, blocks
gc.collect()
torch.cuda.empty_cache()
  • I2V: MAD 0.000000 vs HunyuanVideo15ImageToVideoPipeline
hv15_i2v_standard.mp4
hv15_i2v_modular.mp4
I2V reproduction code
from diffusers.modular_pipelines import HunyuanVideo15Blocks, HunyuanVideo15Image2VideoBlocks, HunyuanVideo15ModularPipeline

# Conditioning image for both I2V runs, converted to RGB.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
).convert("RGB")

# --- Reference run: HunyuanVideo15ImageToVideoPipeline ----------------------
print("=== Standard I2V ===")
ref_pipe = HunyuanVideo15ImageToVideoPipeline.from_pretrained(I2V_ID, torch_dtype=dtype).to(device)
# Fixed seed so the modular run can be compared against this output.
g = torch.Generator(device=device).manual_seed(1234)
ref_out = ref_pipe(
    image=image,
    prompt="A cat turns its head",
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output_type="np",
).frames
print(f"Shape: {np.array(ref_out).shape}")
export_to_video(ref_out[0], "/content/hv15_i2v_standard.mp4", fps=24)

# Drop the reference pipeline and release cached GPU memory before the next
# pipeline is loaded.
del ref_pipe
gc.collect()
torch.cuda.empty_cache()

# --- Modular run: pipeline built from HunyuanVideo15Image2VideoBlocks -------
print("\n=== Modular I2V ===")
blocks = HunyuanVideo15Image2VideoBlocks()
pipe = blocks.init_pipeline(I2V_ID)
pipe.load_components(torch_dtype=dtype)
pipe.to(device)

# Same seed as the reference run.
g = torch.Generator(device=device).manual_seed(1234)
mod_out = pipe(
    image=image,
    prompt="A cat turns its head",
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output="videos",
    output_type="np",
)
print(f"Shape: {np.array(mod_out).shape}")
export_to_video(mod_out[0], "/content/hv15_i2v_modular.mp4", fps=24)

# Mean absolute difference between the reference and modular outputs.
diff = np.abs(
    to_np(ref_out).astype(float) - to_np(mod_out).astype(float)
).mean()
print(f"\nI2V MAD: {diff:.6f}")
print("\n=== Done ===")

Addresses #13295 (HunyuanVideo 1.5 contribution)

Before submitting

Who can review?

@sayakpaul @yiyixuxu @asomoza

@akshan-main
Copy link
Copy Markdown
Author

hey guys @yiyixuxu @sayakpaul would greatly appreciate a review!

@sayakpaul
Copy link
Copy Markdown
Member

@claude could you do an initial review here?

@github-actions
Copy link
Copy Markdown
Contributor

github-actions bot commented Apr 6, 2026

Claude Code is working…

I'll analyze this and get back to you.

View job run

@akshan-main
Copy link
Copy Markdown
Author

@sayakpaul looks like the Claude bot run failed on this one

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants